from urllib import parse

import scrapy
from scrapy.http import Request
from poetry.items import AuthorItem, ArticleItemLoader, PoetryItem
from scrapy.loader import ItemLoader

# Scheme and host of the site being crawled (same origin as the spider's
# start URL below).
base_url = 'https://www.shicimingju.com'

"""
爬取所有诗人信息
"""
class PoemSpider(scrapy.Spider):
    """Spider that walks the author listing and emits one item per author.

    ``parse`` handles the paginated listing starting at ``start_urls`` and
    schedules a request for each author's detail page; ``parse2`` reads that
    detail page and loads an ``AuthorItem``.
    """

    name = 'poem'
    allowed_domains = ['www.shicimingju.com', 'img.shicimingju.com']
    start_urls = ['https://www.shicimingju.com/category/all']

    def parse(self, response):
        """Handle one page of the author listing.

        For every element whose class contains ``zuozhe_card`` that carries
        an author link, yield a ``Request`` for that link with ``parse2`` as
        callback. The raw link, the link texts found under the card's
        ``zuozhe_good_shici_div`` and the card image ``src`` travel along in
        ``meta`` under the keys ``url``, ``magnum_opus`` and ``images``.

        Afterwards, if the pager contains a link whose text includes
        "下一页" (next page), yield a ``Request`` for it handled by this
        same method.

        :param response: response for a listing page.
        """
        for card in response.xpath('//*[contains(@class,"zuozhe_card")]'):
            magnum_opus = card.xpath(
                './div[@class="zuozhe_good_shici_div"]/a/text()').extract()
            author_url = card.xpath(
                './div[@class="zuozhe_list_item"]/h3/a/@href').extract_first()
            images = card.xpath(
                './div[@class="zuozhe_list_item"]'
                '/div[@class="zuozhe_list_des"]/img/@src').extract_first()

            if not author_url:
                # A card without a link cannot be followed. Skip it instead
                # of failing, so the remaining cards and the next-page
                # request on this page are still processed.
                self.logger.warning(
                    'Author card without a link on %s; skipped', response.url)
                continue

            # response.urljoin leaves absolute URLs untouched and resolves
            # relative ones against the current page, so a single code path
            # covers both forms of href.
            yield Request(
                response.urljoin(author_url),
                callback=self.parse2,
                # "url" keeps the href exactly as found on the page so the
                # value stored in the item is unchanged.
                meta={
                    "url": author_url,
                    "magnum_opus": magnum_opus,
                    "images": images,
                },
            )

        next_url = response.xpath(
            '//*[@id="list_nav_part"]/a[contains(text(),"下一页")]/@href'
        ).extract_first()
        if next_url:
            yield Request(url=response.urljoin(next_url), callback=self.parse)

    def parse2(self, response):
        """Build an ``AuthorItem`` from an author detail page.

        Reads ``url``, ``images`` and ``magnum_opus`` from ``response.meta``
        (each defaulting to an empty string), extracts the author name,
        dynasty and introduction text from the ``about_zuozhe`` section of
        the page, and yields the item produced by the loader.

        :param response: response for an author detail page, normally
            scheduled by ``parse``.
        """
        url = response.meta.get("url", "")
        images = response.meta.get("images", "")
        magnum_opus = response.meta.get("magnum_opus", "")

        # Common prefix of the three XPath queries below.
        about = '//*[@id="main_right"]/div[contains(@class,"about_zuozhe")]'

        author = response.xpath(
            about + '/div/div/h4/a/text()').extract_first()
        # string(.) flattens the description node, including nested tags,
        # into a single text value.
        introduction = response.xpath(
            about + '/div/div/div[@class="des"]'
        ).xpath('string(.)').extract_first()
        dynasty = response.xpath(
            about + '/div/div[@class="aside_left"]/div/a/text()'
        ).extract_first()

        item_loader = ItemLoader(item=AuthorItem(), response=response)
        fields = (
            ('author', author),
            ('url', url),
            ('dynasty', dynasty),
            ('introduction', introduction),
            ('magnum_opus', magnum_opus),
            ('images', images),
        )
        for field_name, value in fields:
            item_loader.add_value(field_name, value)
        yield item_loader.load_item()
